import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd # for display and clear_output
import time # for sleep
Last time we discussed using pytorch to construct, train, and use neural networks as regression models. "Regression" refers to the output of continuous values, like rainfall amounts or stock prices.
Today we will modify the code in order to make classification models. These are models that output discrete values, or categorical values, representing class labels for each sample, such as "mask" or "no mask" from images of people, and "excited" or "calm" from speech signals.
The primary changes we need to consider are that we use a different output function for the network, a different loss function for classification, and that our target outputs are now class labels.
For regression, we just output the weighted sum of inputs coming into the output layer as our prediction of the correct output for each sample.
For classification, we want to convert these output values into class probabilities, or the probabilities that a given input sample should be classified as each of the possible classes. So, if we have 2 classes, like "cat" or "dog", we need 2 outputs. For these two outputs to be probabilities, we want each one to be between 0 and 1 and for a given sample we want the two values to sum to 1.
We will accomplish this by passing the outputs of the neural network through a "softmax" function. Let's call the two outputs of our network for input sample $n$, $p_{n,0}$ and $p_{n,1}$, where
$$\begin{align*} 0 \le p_{n,0} \le 1\\ 0 \le p_{n,1} \le 1\\ p_{n,0} + p_{n,1} = 1 \end{align*}$$The softmax function that converts our network outputs, $y_{n,0}$ and $y_{n,1}$, to class probabilities exponentiates each $y$ (base $e$) and divides each result by the sum of these exponentiated values:
$$ \begin{align*} p_{n,i} = \frac{e^{y_{n,i}}}{\sum_{j=0}^{K-1} e^{y_{n,j}}} \end{align*}$$where $K$ is the number of classes, so the class indices $j$ run from $0$ to $K-1$.
Let's do this in python.
def softmax(Y):
    '''Y is n_samples X n_classes.

    First attempt.  Note that np.sum(..., axis=1) drops the summed
    axis, so the division below only broadcasts correctly when Y has
    a single row.  This limitation is demonstrated, and then fixed,
    just below in the notebook.
    '''
    numerators = np.exp(Y)
    denominators = np.sum(numerators, axis=1)
    return numerators / denominators
softmax?
Y = np.array([[-2.3, 8.2]])
Y
array([[-2.3, 8.2]])
P = softmax(Y)
P
array([[2.75356911e-05, 9.99972464e-01]])
np.sum(P)
1.0
Does this work for multiple samples, in rows of Y?
n_samples = 5
n_classes = 3
Y = np.random.uniform(-10, 10, size=(n_samples, n_classes))
Y
array([[-9.25841623, 4.20027187, -5.06507049], [ 2.15564981, -7.69344863, -1.59212194], [-5.78538943, -0.60967184, 4.57905291], [ 4.54206011, 7.35250898, -0.59140415], [ 3.51171099, -0.94854703, -9.98066263]])
P = softmax(Y)
P
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-8-3461d9de5ee0> in <module> ----> 1 P = softmax(Y) 2 P <ipython-input-2-c4371ee16ee1> in softmax(Y) 2 '''Y is n_samples X n_classes''' 3 expY = np.exp(Y) ----> 4 P = expY / np.sum(expY, axis=1) 5 return P ValueError: operands could not be broadcast together with shapes (5,3) (5,)
How can we fix this?
np.sum(Y, axis=1, keepdims=True)
array([[-10.12321484], [ -7.12992075], [ -1.81600836], [ 11.30316494], [ -7.41749867]])
def softmax(Y):
    '''Y is n_samples X n_classes.

    keepdims=True keeps the row sums as an n_samples X 1 column, so
    the division broadcasts across each row and every row of the
    result sums to 1.
    '''
    exponentiated = np.exp(Y)
    row_totals = exponentiated.sum(axis=1, keepdims=True)
    return exponentiated / row_totals
P = softmax(Y)
P
array([[1.42864493e-06, 9.99903932e-01, 9.46392344e-05], [9.76922165e-01, 5.15763805e-05, 2.30262585e-02], [3.13581200e-05, 5.54798920e-03, 9.94420653e-01], [5.67431529e-02, 9.42922284e-01, 3.34563274e-04], [9.88571362e-01, 1.14272723e-02, 1.36566637e-06]])
P.sum(axis=1)
array([1., 1., 1., 1., 1.])
Once we have class probabilities, how do we convert these to classes, or categories?
We just created output matrix Y
with 5 samples and 3 classes representing, let's say, "hot", "warm", and "cold". Now we want to know for each of the 5 samples, was that sample "hot", "warm", or "cold"?
To do this, we just need to identify which of the three class probabilities was largest for each sample. Hey, maybe numpy
has a function for this?
np.argmax(P, axis=1)
array([1, 0, 2, 1, 0])
If we have an np.array
of class names, we can use these integers as indices into the class names.
class_names = np.array(['hot', 'warm', 'cold'])
class_names[np.argmax(P, axis=1)]
array(['warm', 'hot', 'cold', 'warm', 'hot'], dtype='<U4')
Whenever we are dividing, we have to watch out for divide-by-zero errors. This could happen in our softmax
function if all of the Y
values are large negative values.
Y = np.random.uniform(-10000010, -10000000, size=(5, 3))
Y
array([[-10000006.34200973, -10000008.96672827, -10000007.67589873], [-10000009.81675825, -10000004.74712031, -10000008.14881062], [-10000005.20780124, -10000005.33599189, -10000004.97343861], [-10000001.5391905 , -10000007.74223229, -10000001.4729939 ], [-10000006.17404579, -10000000.78783279, -10000006.22983135]])
np.exp(Y)
array([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
softmax(Y)
<ipython-input-10-79349de2e72c>:4: RuntimeWarning: invalid value encountered in true_divide P = expY / np.sum(expY, axis=1, keepdims=True)
array([[nan, nan, nan], [nan, nan, nan], [nan, nan, nan], [nan, nan, nan], [nan, nan, nan]])
We can deal with that by dividing both the numerator and the denominator by $e^{\max_j(y_{n,j})}$, which is equivalent to subtracting each row's maximum $y$ value before exponentiating. This does not change the result, but keeps the exponentials in a safe numerical range.
def softmax(Y):
    '''Y is n_samples X n_classes.

    Numerically stable softmax.  Subtracting each row's maximum before
    exponentiating scales numerator and denominator by the same factor,
    so the probabilities are unchanged, but it prevents overflow for
    large values and all-zero underflow for large negative values.
    '''
    shifted = Y - np.max(Y, axis=1, keepdims=True)
    expShifted = np.exp(shifted)
    return expShifted / expShifted.sum(axis=1, keepdims=True)
softmax(Y)
array([[7.48552941e-01, 5.42402578e-02, 1.97206801e-01], [6.04529191e-03, 9.61906944e-01, 3.20477636e-02], [3.18087559e-01, 2.79817061e-01, 4.02095379e-01], [4.82984454e-01, 9.77206464e-04, 5.16038339e-01], [4.53884076e-03, 9.91168587e-01, 4.29257199e-03]])
For regression, we used the torch.nn.MSELoss()
loss function, because we wanted to minimize the mean-squared-error between all training desired target values and the outputs produced by the neural network.
For classification, we will instead want to maximize the data likelihood, which is the product of all of the correct class probabilities over all samples. Remember that we are using gradient descent to optimize our loss functions. Now, the gradient (or derivative) of a product of a bunch of things is a very computationally heavy calculation. (Why?) So, instead of optimizing this product of probabilities, we will optimize the log of that product, which converts it into a sum of logs of those probabilities. We can do this because the weight values that optimize the product of probabilities are the same weight values that optimize the log of that product!
So we want to maximize the log-likelihood. Recall that the torch.optim
functions are designed to minimize a loss (hence the name "loss"). Since we want to maximize the log-likelihood, we must define the negative-log-likelihood to be used by torch.optim
.
To do this in our python code, we simply replace
torch.nn.MSELoss()
with
torch.nn.NLLLoss()
The final step to convert our code from regression to classification is to construct the correct target, T
, matrix. For regression, we would create for T
an n_samples
x n_outputs
matrix of desired output values. For classification, we instead create a matrix T
of n_samples
values, regardless of how many outputs, or classes, we will have. The values of T
must be from the set $\{0, 1, \ldots, K-1\}$ where $K$ is the number of different class labels we have. We can get the number of different class labels using python like
n_classes = len(np.unique(T))
Let's start with a toy problem. Each sample consists of a single input value, and its class is determined by which of three intervals the value falls in. Using just one input lets us make plots in the animation below.
n_samples = 100

# Class label is the interval the single input falls in:
# 0 for x < 3, 1 for 3 <= x < 6, 2 for x >= 6.
class_boundaries = [3, 6]

X = np.random.uniform(0, 10, size=(n_samples, 1))
T = np.digitize(X[:, 0], class_boundaries).reshape(-1, 1)

Xtest = np.random.uniform(0, 10, size=(n_samples, 1))
Ttest = np.digitize(Xtest[:, 0], class_boundaries).reshape(-1, 1)
X, T
(array([[0.74527494], [5.0835637 ], [6.78310398], [4.61419519], [3.72353747], [6.03706642], [6.08794542], [1.65337227], [8.52025903], [4.14847294], [9.12528426], [4.60843185], [6.85429187], [9.72365158], [4.75608714], [7.50723516], [2.38985605], [8.96662919], [1.78390658], [3.67431871], [1.25314593], [5.39825349], [7.88542309], [2.38447133], [9.33439215], [0.4903726 ], [1.10060789], [2.96704847], [0.6130978 ], [1.06123726], [3.038108 ], [1.23809719], [8.27166477], [6.91655985], [0.81580073], [7.60606028], [8.42448999], [4.69311645], [3.14885818], [9.34106032], [9.97937935], [3.56643008], [7.9738946 ], [5.77928201], [0.84270622], [4.77074341], [8.08889513], [3.96722853], [9.86619619], [0.36044911], [4.00387576], [2.01615881], [3.3891037 ], [9.58775457], [0.39485132], [5.09102195], [4.07048577], [8.63362479], [3.20364354], [0.86357438], [6.18334872], [4.25643311], [1.84496523], [1.23901617], [1.2242609 ], [0.65864965], [3.00952802], [5.6775327 ], [3.47742279], [3.96001918], [4.248112 ], [0.66483946], [3.79403982], [3.77625988], [0.20251507], [9.22809818], [1.67147073], [5.82970945], [3.74473851], [3.03438061], [4.01882699], [5.91906059], [2.22921631], [5.87101102], [5.83028119], [4.8643471 ], [4.49265557], [4.92287404], [8.82721534], [1.11244688], [7.4515501 ], [1.30582537], [0.25648592], [1.79978752], [2.13956547], [2.12839789], [7.7676748 ], [7.32798922], [7.22492687], [4.1238338 ]]), array([[0], [1], [2], [1], [1], [2], [2], [0], [2], [1], [2], [1], [2], [2], [1], [2], [0], [2], [0], [1], [0], [1], [2], [0], [2], [0], [0], [0], [0], [0], [1], [0], [2], [2], [0], [2], [2], [1], [1], [2], [2], [1], [2], [1], [0], [1], [2], [1], [2], [0], [1], [0], [1], [2], [0], [1], [1], [2], [1], [0], [2], [1], [0], [0], [0], [0], [1], [1], [1], [1], [1], [0], [1], [1], [0], [2], [0], [1], [1], [1], [1], [1], [0], [1], [1], [1], [1], [1], [2], [0], [2], [0], [0], [0], [0], [0], [2], [2], [2], [1]]))
np.hstack((X, T))
array([[0.74527494, 0. ], [5.0835637 , 1. ], [6.78310398, 2. ], [4.61419519, 1. ], [3.72353747, 1. ], [6.03706642, 2. ], [6.08794542, 2. ], [1.65337227, 0. ], [8.52025903, 2. ], [4.14847294, 1. ], [9.12528426, 2. ], [4.60843185, 1. ], [6.85429187, 2. ], [9.72365158, 2. ], [4.75608714, 1. ], [7.50723516, 2. ], [2.38985605, 0. ], [8.96662919, 2. ], [1.78390658, 0. ], [3.67431871, 1. ], [1.25314593, 0. ], [5.39825349, 1. ], [7.88542309, 2. ], [2.38447133, 0. ], [9.33439215, 2. ], [0.4903726 , 0. ], [1.10060789, 0. ], [2.96704847, 0. ], [0.6130978 , 0. ], [1.06123726, 0. ], [3.038108 , 1. ], [1.23809719, 0. ], [8.27166477, 2. ], [6.91655985, 2. ], [0.81580073, 0. ], [7.60606028, 2. ], [8.42448999, 2. ], [4.69311645, 1. ], [3.14885818, 1. ], [9.34106032, 2. ], [9.97937935, 2. ], [3.56643008, 1. ], [7.9738946 , 2. ], [5.77928201, 1. ], [0.84270622, 0. ], [4.77074341, 1. ], [8.08889513, 2. ], [3.96722853, 1. ], [9.86619619, 2. ], [0.36044911, 0. ], [4.00387576, 1. ], [2.01615881, 0. ], [3.3891037 , 1. ], [9.58775457, 2. ], [0.39485132, 0. ], [5.09102195, 1. ], [4.07048577, 1. ], [8.63362479, 2. ], [3.20364354, 1. ], [0.86357438, 0. ], [6.18334872, 2. ], [4.25643311, 1. ], [1.84496523, 0. ], [1.23901617, 0. ], [1.2242609 , 0. ], [0.65864965, 0. ], [3.00952802, 1. ], [5.6775327 , 1. ], [3.47742279, 1. ], [3.96001918, 1. ], [4.248112 , 1. ], [0.66483946, 0. ], [3.79403982, 1. ], [3.77625988, 1. ], [0.20251507, 0. ], [9.22809818, 2. ], [1.67147073, 0. ], [5.82970945, 1. ], [3.74473851, 1. ], [3.03438061, 1. ], [4.01882699, 1. ], [5.91906059, 1. ], [2.22921631, 0. ], [5.87101102, 1. ], [5.83028119, 1. ], [4.8643471 , 1. ], [4.49265557, 1. ], [4.92287404, 1. ], [8.82721534, 2. ], [1.11244688, 0. ], [7.4515501 , 2. ], [1.30582537, 0. ], [0.25648592, 0. ], [1.79978752, 0. ], [2.13956547, 0. ], [2.12839789, 0. ], [7.7676748 , 2. ], [7.32798922, 2. ], [7.22492687, 2. ], [4.1238338 , 1. ]])
np.sum(T == 0), np.sum(T == 1), np.sum(T == 2)
(32, 39, 29)
import torch
torch.__version__
'1.4.0'
Remember that pytorch
requires inputs of single precision float. The NLLLoss
function requires target class labels to be a one-dimensional tensor of 64-bit ("long") integers.
torch.from_numpy(X).float()
tensor([[0.7453], [5.0836], [6.7831], [4.6142], [3.7235], [6.0371], [6.0879], [1.6534], [8.5203], [4.1485], [9.1253], [4.6084], [6.8543], [9.7237], [4.7561], [7.5072], [2.3899], [8.9666], [1.7839], [3.6743], [1.2531], [5.3983], [7.8854], [2.3845], [9.3344], [0.4904], [1.1006], [2.9670], [0.6131], [1.0612], [3.0381], [1.2381], [8.2717], [6.9166], [0.8158], [7.6061], [8.4245], [4.6931], [3.1489], [9.3411], [9.9794], [3.5664], [7.9739], [5.7793], [0.8427], [4.7707], [8.0889], [3.9672], [9.8662], [0.3604], [4.0039], [2.0162], [3.3891], [9.5878], [0.3949], [5.0910], [4.0705], [8.6336], [3.2036], [0.8636], [6.1833], [4.2564], [1.8450], [1.2390], [1.2243], [0.6586], [3.0095], [5.6775], [3.4774], [3.9600], [4.2481], [0.6648], [3.7940], [3.7763], [0.2025], [9.2281], [1.6715], [5.8297], [3.7447], [3.0344], [4.0188], [5.9191], [2.2292], [5.8710], [5.8303], [4.8643], [4.4927], [4.9229], [8.8272], [1.1124], [7.4516], [1.3058], [0.2565], [1.7998], [2.1396], [2.1284], [7.7677], [7.3280], [7.2249], [4.1238]])
torch.from_numpy(T).reshape(-1).long()
tensor([0, 1, 2, 1, 1, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2, 0, 2, 0, 1, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 0, 2, 2, 0, 2, 2, 1, 1, 2, 2, 1, 2, 1, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0, 1, 1, 2, 1, 0, 2, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2, 1])
np.unique(T)
array([0, 1, 2])
# Network dimensions derived from the data.
n_inputs = X.shape[1]
n_classes = len(np.unique(T))

# Convert to torch tensors: inputs must be single-precision floats, and
# NLLLoss targets must be one-dimensional long (64-bit integer) tensors.
Xt = torch.from_numpy(X).float()
Tt = torch.from_numpy(T).reshape(-1).long()
Xtestt = torch.from_numpy(Xtest).float()
Ttestt = torch.from_numpy(Ttest).reshape(-1).long()
# Classification network: three tanh hidden layers (10, 20, 10 units),
# then a Linear layer producing n_classes outputs passed through
# LogSoftmax, giving the log class probabilities that NLLLoss expects.
nnet = torch.nn.Sequential(torch.nn.Linear(n_inputs, 10), torch.nn.Tanh(),
                           torch.nn.Linear(10, 20), torch.nn.Tanh(),
                           torch.nn.Linear(20, 10), torch.nn.Tanh(),
                           torch.nn.Linear(10, n_classes), torch.nn.LogSoftmax(dim=1))

learning_rate = 0.01
n_epochs = 10000

# Plain stochastic gradient descent minimizing the negative log-likelihood.
optimizer = torch.optim.SGD(nnet.parameters(), lr=learning_rate)
nll_f = torch.nn.NLLLoss()

# Per-epoch data likelihoods, filled in by the training loop below.
likelihood_trace = []
likelihood_test_trace = []

# Figure reused (cleared and redrawn) by the animation in the training loop.
fig = plt.figure(figsize=(10, 12))
def forward_all_layers(X, net=None):
    '''Return the output of every layer of a Sequential network for input X.

    X is a torch tensor of inputs.  net is the torch.nn.Sequential to
    run; it defaults to the module-level `nnet`, so existing callers are
    unaffected.  The returned list has one tensor per layer, in order,
    with the input itself excluded.
    '''
    if net is None:
        net = nnet
    outputs = [X]
    for layer in net:
        outputs.append(layer(outputs[-1]))
    return outputs[1:]  # drop the input; keep one entry per layer
# Training loop.  Each epoch: forward pass, NLL loss on the network's
# log-probabilities, backprop, SGD step.  Every 1000 epochs the
# diagnostic figure is redrawn as an animation.
for epoch in range(n_epochs):

    # nnet ends in LogSoftmax, so its output is log class probabilities.
    logP = nnet(Xt)
    nll = nll_f(logP, Tt)

    optimizer.zero_grad()
    nll.backward()
    optimizer.step()

    # Likelihood traces for plotting.  .item() stores a plain Python
    # float; appending the tensor itself would keep every epoch's
    # autograd graph alive and steadily grow memory use.
    likelihood_trace.append((-nll).exp().item())

    # Evaluate the test set without building an autograd graph.
    with torch.no_grad():
        logPtest = nnet(Xtestt)
        likelihood_test_trace.append((-nll_f(logPtest, Ttestt)).exp().item())

    if epoch % 1000 == 0 or epoch == n_epochs - 1:

        plt.clf()

        # Layers come in (Linear, Tanh) pairs plus the final
        # (Linear, LogSoftmax) pair; this recovers the hidden-layer
        # count from the Sequential's length.
        n_hidden_layers = (len(nnet) - 1) // 2
        nplots = 2 + n_hidden_layers

        # Plot 1: train and test likelihood versus epoch.
        plt.subplot(nplots, 1, 1)
        plt.plot(likelihood_trace[:epoch])
        plt.plot(likelihood_test_trace[:epoch])
        plt.xlabel('Epochs')
        plt.ylabel('Likelihood')
        plt.legend(('Train','Test'), loc='upper left')

        # Plot 2: actual and predicted classes, sorted by x so the
        # connected markers trace the class boundaries cleanly.
        plt.subplot(nplots, 1, 2)
        classes = logPtest.argmax(axis=1).numpy()
        order = np.argsort(X, axis=0).reshape(-1)
        plt.plot(X[order, :], T[order, :], 'o-', label='Train')
        order = np.argsort(Xtest, axis=0).reshape(-1)
        plt.plot(Xtest[order, :], Ttest[order, :], 'o-', label='Test')
        plt.plot(Xtest[order, :], classes[order], 'o-')
        plt.legend(('Training','Testing','Model'), loc='upper left')
        plt.xlabel('$x$')
        plt.ylabel('Actual and Predicted Class')

        # Remaining plots: hidden-layer (Tanh) outputs, deepest first.
        Ys = forward_all_layers(Xt)
        Z = Ys[:-1]
        ploti = 2
        for layeri in range(n_hidden_layers, 0, -1):
            ploti += 1
            plt.subplot(nplots, 1, ploti)
            order = np.argsort(X, axis=0).reshape(-1)
            # Ys alternates Linear/Tanh outputs; index 2*layeri - 1 is
            # the Tanh output of hidden layer `layeri`.
            plt.plot(X[order, :], Z[layeri * 2 - 1][order, :].detach())
            plt.xlabel('$x$')
            plt.ylabel(f'Hidden Layer {layeri}')

        ipd.clear_output(wait=True)
        ipd.display(fig)

ipd.clear_output(wait=True)